Data Loading

Load required libraries

rm(list=ls())
library(ggplot2)
library(dplyr)
library(tidyr)
library(RMySQL)
library(stringr)
library(pcaPP)
library(directlabels)
library(proto)

Load in wordbank data

## OPEN DATABASE CONNECTION ##
wordbank <- src_mysql(dbname="wordbank", host="54.149.39.46",
                      user="wordbank", password="wordbank")

## NOW LOAD TABLES ##
source.table <- tbl(wordbank, "common_source")
admin.table <- tbl(wordbank, "common_administration")
child.table <- tbl(wordbank, "common_child")
wordmapping.table <- tbl(wordbank, "common_wordmapping")
instruments.table <- tbl(wordbank, "common_instrumentsmap")
english.ws.table <- tbl(wordbank, "instruments_english_ws")
spanish.ws.table <- tbl(wordbank, "instruments_spanish_ws")
norwegian.ws.table <- tbl(wordbank, "instruments_norwegian_ws")
danish.ws.table <- tbl(wordbank, "instruments_danish_ws")

Get kid data and put together.

# Get administration info
admins <- admin.table %>%
  select(data_id,child_id,age,source_id) %>%
  rename(id = data_id, child.id = child_id, source.id = source_id) 
admins <- as.data.frame(admins)

# Get demographic variables for each child
demos <- select(child.table,id,sex,mom_ed,birth_order) %>%
  rename(child.id = id) # Rename id fields
demos <- as.data.frame(demos)

# Join age and demographics together
child.data <- as.tbl(left_join(admins,demos))

Set up mappings and instruments.

mapping <- as.data.frame(wordmapping.table)
instruments <- as.data.frame(instruments.table) %>%
  rename(instrument_id = id)
items <- left_join(mapping, instruments)

Fucntion for getting all of the data in wordbank for a given language (kid x item).

get.language.data <- function(lang.table, lang.items, lang, child.data) {
  
  instrument.items <- lang.items %>% 
    filter(language == lang, form == 'WS') %>%
    select(item, type, category, lexical_category, definition) %>%
    mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies
  
  instrument.data <- as.data.frame(lang.table) %>%
    rename(id = basetable_ptr_id) %>% # Rename the id
    gather(item, value, -id) %>% # Arrange in longform
    mutate(item = str_replace(item, "item_", "")) # Strip off item_ 
  
  d <- left_join(instrument.data, instrument.items)
  d <- left_join(d, child.data)
  }

Get (kid x item) data for all languages.

d.english <- get.language.data(lang.table=english.ws.table, 
                               lang.items=items, 
                               lang="English",
                               child.data)

d.spanish <- get.language.data(lang.table=spanish.ws.table, 
                               lang.items=items, 
                               lang="Spanish",
                               child.data)

d.norwegian <- get.language.data(lang.table=norwegian.ws.table, 
                                 lang.items=items, 
                                 lang="Norwegian",
                                 child.data)

# Norwegian data is loaded in funny -- NAs in wordform are actually 0s
d.norwegian[d.norwegian$type %in% c("word_form","word")
            & is.na(d.norwegian$value),]$value = ""

d.danish <- get.language.data(lang.table=danish.ws.table, 
                              lang.items=items, 
                              lang="Danish",
                              child.data)

# Danish data is loaded in funny -- NAs in wordform are actually 0s
d.danish[d.danish$type %in% c("word_form","word")
         & is.na(d.danish$value),]$value = ""

Function for getting vocab size data.

language.vocab.sizes <- function(lang.data) {
  d.vocab <- lang.data %>%
    filter(type == "word") %>%
    group_by(age,id) %>%
    summarise(vocab.sum = sum(value == "produces", na.rm=TRUE),
              vocab.mean = vocab.sum/length(value))
  
  return(d.vocab)
  }

Syntax and Morphology Analyses

Function for getting (kid x {vocab size, syntax score, morphology score}) data.

summarise.language.data <- function(lang.data,lang) {
  
  d.vocab <- language.vocab.sizes(lang.data)
  
  d.complexity <- lang.data %>%
    filter(type == "complexity") %>%
    group_by(id) %>%
    summarise(all.na = all(is.na(value)),
              complexity.sum = sum(value == "complex", 
                                   na.rm=TRUE) / length(value)) %>%
    mutate(complexity = ifelse(all.na,NA,complexity.sum)) %>%
    select(-all.na,-complexity.sum) # Deals with ifelse 
  # forcing values to logical
  
  d.wordform <- lang.data %>%
    filter(type == "word_form") %>%
    group_by(id) %>%
    summarise(all.na = all(is.na(value)),
              wordform.sum = sum(value == "produces", 
                                 na.rm=TRUE) / length(value)) %>%
    mutate(wordform = ifelse(all.na,NA,wordform.sum)) %>%
    select(-all.na,-wordform.sum) # Deals with ifelse 
  # forcing values to logical
  
  # Spanish doesn't have ending data, so its skipped, at least for now.
  #   d.ending <- d %>%
  #     filter(type %in% c("ending")) %>%
  #     group_by(id) %>%
  #     summarise(ending_sometimes = mean(value == "sometimes" | 
  #                                       value == "often", 
  #                                       na.rm=TRUE), 
  #               ending_often = mean(value == "often", 
  #                                   na.rm=TRUE))
  #  d.composite <- left_join(d.composite, d.ending)
  
  d.composite <- left_join(d.vocab, d.complexity)
  d.composite <- left_join(d.composite, d.wordform) %>%
      ungroup() %>%
      filter(age > 15 & age < 32) %>%
      mutate(age.group = cut(age, breaks = c(15, 19, 23, 27, 31)),
             age.bin = cut(age, quantile(age), include.lowest=T))

  d.composite$age.bin <- factor(d.composite$age.bin, labels=c(1,2,3,4))
  #   %>%
  #     filter(num.complexity.na == 0) %>%
  #     select(-num.complexity.na)
  #   
  d.composite$language <- lang
  
  return(d.composite)
  }

Get (kid x {vocab size, syntax score, morphology score}) data for all languages and aggregate them.

summary.english <- summarise.language.data(d.english,"English")
summary.spanish <- summarise.language.data(d.spanish,"Spanish")
summary.norwegian <- summarise.language.data(d.norwegian,"Norwegian")
summary.danish <- summarise.language.data(d.danish,"Danish")
 
summary.data <- rbind_list(summary.english, summary.spanish,
                           summary.norwegian, summary.danish) %>%
  mutate(language = factor(language, levels=c("English", "Spanish", 
                                              "Norwegian", "Danish")))
# gather for plotting
ms <- summary.data %>% gather(measure, score, complexity:wordform) %>%
  mutate(measure = factor(measure, levels = c("wordform","complexity"),
                          labels = c("Word Form", "Complexity")))
#ms %>% 
#  group_by(language, age.bin) %>% 
#  summarise(n = n())

Using Age and Vocab to predict Morphology and Syntax Scores.

#quartz(width=8,height=7.5)
#ggplot(ms, aes(x = vocab.mean, y = score, colour = age.group, fill = age.group,
#               label = age.group)) + 
ggplot(ms, aes(x = vocab.mean, y = score, colour = age.bin, fill = age.bin,
               label = age.bin)) + 
  #geom_point(alpha=.5, size=.8) + 
  geom_jitter(alpha=.6, ssize=.8) +
  geom_smooth(method="lm", formula = y ~ I(x^2) - 1) + 
  facet_grid(language~measure) + 
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
                     name = "Vocabulary Size") + 
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
                     "Score (Mean Items)") + 
  theme_bw(base_size = 14) +
  scale_color_brewer(palette="Set1") +
  scale_fill_brewer(palette="Set1") 

Using Morphology scores to predict Syntax scores.

#quartz(width=8,height=7.5)
ggplot(summary.data,aes(x = wordform, y = complexity, fill=age.group,colour=age.group,
                        label=age.group)) + 
  facet_wrap( ~ language) +
  geom_jitter(size=1)+
  geom_smooth(method="lm", formula = y ~ exp(x) - 1) + 
  scale_x_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),
                     name = "Morphology Score") + 
  scale_y_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),"Syntax Score") + 
  scale_color_brewer(palette="Set1") +
  scale_fill_brewer(palette="Set1") +
  theme_bw(base_size = 14)

Examine relationship between vocab size, age, and syntax/morphology scores.

# fit regressions to data
english.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1,
                     data=filter(ms,language=="English"))
spanish.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1, 
                     data=filter(ms,language=="Spanish"))
norwegian.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1, 
                       data=filter(ms,language=="Norwegian"))
danish.lm.grp <- lm(score ~ I((vocab.mean*100)^2) * age.group * measure - age.group - measure - 1, 
                    data=filter(ms,language=="Danish"))

english.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1,
                     data=filter(ms,language=="English"))
spanish.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1, 
                     data=filter(ms,language=="Spanish"))
norwegian.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1, 
                       data=filter(ms,language=="Norwegian"))
danish.lm.bin <- lm(score ~ I((vocab.mean*100)^2) * age.bin * measure - age.bin - measure - 1, 
                    data=filter(ms,language=="Danish"))

english.lm.noage <- lm(score ~ I((vocab.mean*100)^2) * measure - measure - 1,
                       data=filter(ms,language=="English"))
spanish.lm.noage <- lm(score ~ I((vocab.mean*100)^2) * measure - measure - 1, 
                       data=filter(ms,language=="Spanish"))
norwegian.lm.noage <- lm(score ~ I((vocab.mean*100)^2) * measure - measure - 1, 
                         data=filter(ms,language=="Norwegian"))
danish.lm.noage <- lm(score ~ I((vocab.mean*100)^2) * measure - measure - 1, 
                      data=filter(ms,language=="Danish"))

predicted.data <- as.data.frame(ms)

predicted.data$predicted.grp <- NA
predicted.data$predicted.bin <- NA

predicted.data[predicted.data$language=="English",]$predicted.grp <- 
  predict.lm(english.lm.grp, predicted.data[predicted.data$language=="English",])
predicted.data[predicted.data$language=="Spanish",]$predicted.grp <- 
  predict.lm(spanish.lm.grp, predicted.data[predicted.data$language=="Spanish",])
predicted.data[predicted.data$language=="Norwegian",]$predicted.grp <- 
  predict.lm(norwegian.lm.grp, predicted.data[predicted.data$language=="Norwegian",])
predicted.data[predicted.data$language=="Danish",]$predicted.grp <- 
  predict.lm(danish.lm.grp, predicted.data[predicted.data$language=="Danish",])

predicted.data[predicted.data$language=="English",]$predicted.bin <- 
  predict.lm(english.lm.bin, predicted.data[predicted.data$language=="English",])
predicted.data[predicted.data$language=="Spanish",]$predicted.bin <- 
  predict.lm(spanish.lm.bin, predicted.data[predicted.data$language=="Spanish",])
predicted.data[predicted.data$language=="Norwegian",]$predicted.bin <- 
  predict.lm(norwegian.lm.bin, predicted.data[predicted.data$language=="Norwegian",])
predicted.data[predicted.data$language=="Danish",]$predicted.bin <- 
  predict.lm(danish.lm.bin, predicted.data[predicted.data$language=="Danish",])

Replot original correlation with fitted model.

#quartz(width=6,height=6)
ggplot(predicted.data, aes(x = vocab.mean, y = score, 
                           colour = age.group, fill = age.group,
                           label = age.group)) + 

  geom_jitter(alpha=.6, size=.8, pch="o") +
  geom_line(aes(y=predicted.grp),size=0.8) + 
#  facet_grid(measure~language) + 
  facet_grid(language~measure) + 
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "Vocabulary Size") + 
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
                     "Score (Mean Items)") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  scale_color_brewer(palette="Set1",
                     name="Age Group (months)") +
#                     labels=c("16-19","20-23","24-27","28-31")) +
  scale_fill_brewer(palette="Set1",
                    guide=FALSE)

#ggsave(file=("grammar.pdf"), width=6, height=6)

Same plot but with equal number of kids age bins.

#quartz(width=6,height=6)
ggplot(predicted.data, aes(x = vocab.mean, y = score, 
                           colour = age.bin, fill = age.bin,
                           label = age.bin)) + 

  geom_jitter(alpha=.6, size=.8, pch="o") +
  geom_line(aes(y=predicted.bin),size=0.8) + 
#  facet_grid(measure~language) + 
  facet_grid(language~measure) + 
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "Vocabulary Size") + 
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
                     "Score (Mean Items)") + 
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom") +
  scale_color_brewer(palette="Set1",
                     name="Age Group (months)") +
#                     labels=c("16-19","20-23","24-27","28-31")) +
  scale_fill_brewer(palette="Set1",
                    guide=FALSE)

#ggsave(file=("grammar.pdf"), width=6, height=6)
#plot.measure <- function(meas) {
#plot <- ggplot(filter(predicted.data, measure==meas),
#               aes(x = vocab.mean, y = score, 
#               colour = age.group, fill = age.group,
#               label = age.group)) +
#  geom_jitter(alpha=.6,size=.8, pch="o") +
#  geom_line(aes(y=predicted.grp),size=0.9) + 
#  facet_grid(language~.) + 
#  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
#                     name = "Vocabulary Size") + 
#  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
#                     paste(meas, "Score (Mean Items)")) + 
#  theme_bw(base_size = 11) +
#  scale_color_brewer(palette="Set1",
#                     name="Age Group\n (months)",
#                     labels=c("16-19","20-23","24-27","28-31")) +
#  scale_fill_brewer(palette="Set1",
#                    guide=FALSE) +
#  theme(legend.position = "bottom",
#        legend.background = element_rect(colour = "grey"))
#return(plot)
#}
#quartz(width=4,height=7)
#plot.measure("Complexity") %>%
#ggsave(file=("complexity.pdf"), width=4, height=7)

#quartz(width=4,height=7)
#plot.measure("Word Form") %>%
#ggsave(file=("wordform.png"), width=4, height=7)

Compute (vocab x age) interaction coefficients for each measure and language.

mod.coef.fun <- function(score, vocab.mean, age) {
  return(coef(lm(score ~ I((vocab.mean*100)^2)*age+0 - age))[2])
  }

mod.se.fun <- function(score, vocab.mean, age) {
  return(summary(lm(score ~ I((vocab.mean*100)^2)*age+0 - age))$coefficients[2,2])
  }

coefs <- ms %>% 
  group_by(language, measure) %>%
  summarise(coef = mod.coef.fun(score,vocab.mean,age),
            se = mod.se.fun(score,vocab.mean,age))

Plot coefficients for each language.

#quartz(width=6, height=4)
ggplot(coefs, 
       aes(x=language, y=coef, fill=measure)) + 
  geom_bar(position="dodge", stat="identity") + 
  geom_linerange(aes(ymin=coef-se, ymax=coef+se), 
                 position = position_dodge(width=.9)) +
  ylab("Age interaction coefficient") + 
  xlab("Language") +
  theme(legend.position = "bottom") +
  theme_bw(base_size = 14) +
  scale_fill_brewer(palette = "Set1",
                    name="")

#ggsave(file=("coeffs.pdf"), width=6, height=4)

Get (kid x item x {vocab size, value}) data for wordform and complexity items.

grammar.by.item <- function(lang.data, lang) {
  
  d.vocab <- language.vocab.sizes(lang.data)
  
  d.complexity <- lang.data %>%
    filter(type == "complexity") %>%
    mutate(value = value == "complex") %>%
    select(-lexical_category, -category)
  
  d.wordform <- lang.data %>%
    filter(type == "word_form") %>%
    mutate(value = value == "produces") %>%
    select(-lexical_category, -category)
  
  d.data <- rbind(d.complexity, d.wordform)
  d.data <- left_join(d.vocab, d.data) %>%
    filter(age > 15, age < 33)
  
  return(d.data)
  }

english.grammar.by.item <- grammar.by.item(d.english,"English")
spanish.grammar.by.item <- grammar.by.item(d.spanish,"Spanish")
norwegian.grammar.by.item <- grammar.by.item(d.norwegian,"Norwegian")
danish.grammar.by.item <- grammar.by.item(d.danish,"Danish")

Compute (vocab x age) interaction terms for each wordform and complexity item.

#compute interaction terms each item
i.terms.function <- function(data,x) {
  return(summary(glm(value ~ I((vocab.mean*100)^2)*age+0, 
                     data=filter(data,definition==x),
                     family="binomial"))$coefficients[3,3])
  }

complexity.diff <- function(item) {
  
  if(length(grep("/", item)) == 0) {return(item)}
  else{
    phrases <- str_split(item," / ")[[1]]
    first.phrase <- str_split(phrases[1], " ")[[1]]
    second.phrase <- str_split(phrases[2], " ")[[1]]
  
    first.diff <- setdiff(first.phrase,second.phrase)
    second.diff <- setdiff(second.phrase,first.phrase)
  
    if(length(first.diff)==0) return(paste(second.diff,collapse=" ")) 
    else if(length(second.diff)==0) return(paste(first.diff,collapse=" "))
    else{first.phrase = paste(first.diff, collapse =" ")
         second.phrase = paste(second.diff,collapse = " ")
         return(paste(first.phrase, second.phrase ,sep=" / "))}
    }
  }

lang.interaction.terms<- function(grammar.by.item) {
  
  complexity.terms <- sapply(unique(filter(grammar.by.item,
                                           type=="complexity")$definition), 
                             function(item) i.terms.function(grammar.by.item,
                                                             item))
  
  complexity.terms <- data.frame(type = "complexity",
                                 definition=names(complexity.terms),
                                 item=1:length(complexity.terms),
                                 term = complexity.terms,
                                 row.names=NULL) %>%
    mutate(definition= sapply(definition,complexity.diff),
           item = paste(item,definition,sep=". ")) %>%
    select(-definition)
  
  wordform.terms <- sapply(unique(filter(grammar.by.item,
                                         type=="word_form")$definition), 
                           function(item) i.terms.function(grammar.by.item,
                                                           item))
  
  wordform.terms <- data.frame(type = "wordform",
                               item=names(wordform.terms),
                               term = wordform.terms,
                               row.names=NULL)
  
  #rename results to be human-readable
  interaction.terms <-rbind(complexity.terms,wordform.terms) %>%
    arrange(term) %>%
    mutate(item = factor(item,levels=item))
  
  return(interaction.terms)
  }

spanish.grammar.by.item$definition <- spanish.grammar.by.item$item
norwegian.grammar.by.item$definition <- norwegian.grammar.by.item$item
danish.grammar.by.item$definition <- danish.grammar.by.item$item

english.interaction.terms <- lang.interaction.terms(english.grammar.by.item)
spanish.interaction.terms <-lang.interaction.terms(spanish.grammar.by.item)
norwegian.interaction.terms <- lang.interaction.terms(norwegian.grammar.by.item)
danish.interaction.terms <-  lang.interaction.terms(danish.grammar.by.item)

Plot interaction terms by item for each language.

interaction.plot <- function(lang.interaction.terms, lang) {
  plt <- ggplot(lang.interaction.terms,
                aes(x=item,y=term,fill=type,label=item)) +
    geom_bar(stat="identity", position="identity", alpha=.5) +
    geom_text(y=0.15, angle=90, hjust=0, size=3.5) +
    theme_bw(base_size = 14) +
    scale_y_continuous(name="Age x Vocabulary Interaction Z-score") +
    #                     limits=c(-10,20),
    #                     breaks=seq(-10,20,2.5)) +
    scale_x_discrete(name="CDI Item",breaks=NULL) +
    scale_fill_brewer(palette="Set1") +
    scale_colour_brewer(palette="Set1") +
    theme(legend.position="bottom") +
    ggtitle(lang)
  return(plt)
  }
interaction.plot(english.interaction.terms, "English") #%>%

#ggsave(file=("english_interactions.png"), width=10, height=5)
interaction.plot(spanish.interaction.terms, "Spanish") #%>%

#ggsave(file=("spanish_interactions.png"), width=10, height=5)
interaction.plot(norwegian.interaction.terms, "Norwegian") #%>%

#ggsave(file=("norwegian_interactions.png"), width=10, height=5)
interaction.plot(danish.interaction.terms, "Danish") #$>$

#ggsave(file=("danish_interactions.png"), width=10, height=5)

Vocabulary Composition Analysis

Function for computing vocabulary composition for each speaker of a language.

vocab.composition <- function(lang.data,lang) {  
  
  d.vocab <- language.vocab.sizes(lang.data)
  
  d.cat <- lang.data %>%
    filter(type == "word") %>%
    group_by(id,lexical_category) %>%
    summarise(cat = sum(value == "produces", na.rm=TRUE))
  
  d.vocab.comp <- left_join(d.vocab, d.cat) %>%
    mutate(prop = cat / vocab.sum)
  d.vocab.comp$language = lang
  
  return(d.vocab.comp)
  }

Function for computing CDI form composition for all languages.

lang.vocab.composition <- function(lang.items) {  
  
  lang.words <- lang.items %>%
    filter(form == "WS",type=="word")
  
  lang.num.total <- lang.words %>%
    group_by(language) %>%
    summarise(n = n())
  
  lang.vocab.comp <-  lang.words %>%
    group_by(language,lexical_category) %>%
    summarise(num.per.cat = n())
  
  lang.vocab.comp <- left_join(lang.vocab.comp, lang.num.total) %>%
    mutate(prop.per.cat = num.per.cat/n)
  
  return(lang.vocab.comp)
  
  }

Get vocabulary composition data for all languages.

# get form compositions
lang.vocab.comp <- lang.vocab.composition(items) %>%
  ungroup() %>%
  mutate(language = factor(language,
                           levels = c("English", "Spanish", 
                                      "Norwegian", "Danish")),
         lexical_category = factor(lexical_category, 
                                   levels=c("nouns","predicates",
                                            "function_words","other"),
                                   labels=c("Nouns", "Predicates",
                                            "Function Words","Other"))) %>%
  filter(lexical_category != "Other")
  

# get data for kids in each language
vocab.comp.english <- vocab.composition(d.english,"English")
vocab.comp.spanish <- vocab.composition(d.spanish,"Spanish")
vocab.comp.norwegian <- vocab.composition(d.norwegian,"Norwegian")
vocab.comp.danish <- vocab.composition(d.danish,"Danish")

# aggregate data for all languages together
summary.vocab.comp <- rbind_list(vocab.comp.english,vocab.comp.spanish,
                                 vocab.comp.norwegian,vocab.comp.danish) %>%
#  filter(age > 15 & age < 33) %>%
  mutate(#age.group = cut(age, breaks = c(15, 20, 24, 28, 32)),
         language = factor(language,
                           levels = c("English", "Spanish", 
                                      "Norwegian", "Danish")),
         lexical_category = factor(lexical_category, 
                                   levels = c("nouns", "predicates", 
                                              "function_words", "other"),
                                   labels = c("Nouns", "Predicates", 
                                              "Function Words", "Other")))
summary.vocab.comp <- left_join(ms, summary.vocab.comp)

Plot vocabulary composition by language.

ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
       aes(x=vocab.mean, y=prop, colour=lexical_category, 
           shape = lexical_category, fill = lexical_category,
           label=lexical_category)) +
  geom_point(size = 1, alpha = 0.25) +
  facet_wrap(~ language) +
  geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of total vocabulary") +
  scale_x_continuous(name = "Vocabulary Size") +
  geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")+
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="none")

Plot vocabulary composition by language and age group.

ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
       aes(x=vocab.mean, y=prop, colour=lexical_category, 
           shape = lexical_category, fill = lexical_category,
           label = lexical_category)) +
  geom_jitter(size = 1, alpha = 0.5) +
  facet_grid(language ~ age.group) +
  geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of total vocabulary") +
  scale_x_continuous(name = "Vocabulary Size") +
  geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")+
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="none")

Plot vocabulary composition by language, split by age group.

#quartz(width=7, height=6)
ggplot(filter(summary.vocab.comp, lexical_category != "Other"),
       aes(x=vocab.mean, y=prop, colour=age.group, linetype = lexical_category)) +
#  geom_jitter(size = 1, alpha = 0.5) +
  facet_wrap(~ language) +
  geom_hline(data=lang.vocab.comp, 
             aes(yintercept=prop.per.cat),
             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(method='loess', span=.5) +
  scale_x_continuous(limits = c(0, 1), breaks = seq(0,1,.2),
                     name = "Vocabulary Size") + 
  scale_y_continuous(limits = c(0, .6), breaks = seq(0,1,.2),
                     "Proportion of total vocabulary") + 
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1", name = "Age Group (months)") +
  scale_fill_brewer(palette = "Set1") +
  scale_linetype(name = "Lexical Category") +
  theme(#axis.text.x = element_text(angle=-40, hjust = 0),
#        axis.title.y = element_text(vjust=0.35),
#        axis.title.x = element_text(vjust=-0.5),
        legend.position="bottom")

#ggsave(file=("age_composition.pdf"), width=7, height=6)

Plot vocabulary composition by language and lexical category.

#quartz(width=8, height=6)
ggplot(filter(summary.vocab.comp, lexical_category != "Other"),
       aes(x=vocab.mean, y=prop, colour = age.group, fill = age.group)) +
#  geom_jitter(size = 1, alpha = 0.5) +
  facet_grid(language ~ lexical_category) +
  geom_hline(data=lang.vocab.comp, 
             aes(yintercept=prop.per.cat),
             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(aes(group=age.group), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of total vocabulary") +
  scale_x_continuous(name = "Vocabulary Size") +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")+
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="none")

#ggsave(file=("age_composition.pdf"), width=8, height=6)

Re-compute vocabulary composition data as a proportion of items on the CDI rather than a proportion of vocabulary size.

prop.comp <- left_join(filter(summary.vocab.comp, lexical_category != "Other"),
                        lang.vocab.comp) %>%
  mutate(cdi.prop = cat / num.per.cat) %>%
  select(-prop, -prop.per.cat, -n, -num.per.cat)

Use Age and Lexical Category Score to predict Morphology and Syntax Scores, for each lexical category.

plot.vocab.comp.prediction <- function(category) {
  p <- ggplot(filter(prop.comp, lexical_category==category),
              aes(x = cdi.prop, y = score, colour = age.group, fill = age.group,
                  label = age.group)) + 
        #geom_point(alpha=.5, size=.8) + 
    geom_jitter(alpha=.5,size=.8) +
    geom_smooth(method="lm", formula = y ~ I(x^2) - 1) + 
    facet_grid(language ~ measure) + 
    scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
                       name = "Category Size") + 
    scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
                       "Score (Mean Items)") + 
    theme_bw(base_size = 14) +
    ggtitle(category) +
    scale_color_brewer(palette="Set1") +
    scale_fill_brewer(palette="Set1") 
  
  return(p)
}
plot.vocab.comp.prediction("Nouns")

plot.vocab.comp.prediction("Predicates")

plot.vocab.comp.prediction("Function Words")

Fit models to vocab composition data.

comp.lm <- function(lang) {
  
  prop.comp.lang <- filter(prop.comp, language == lang, lexical_category != "Other")
  comp.age.lm <- lm(cdi.prop ~ lexical_category : I((vocab.mean*100)^2):age.group +
                           lexical_category : I((vocab.mean*100)):age.group + 0,
                         data=prop.comp.lang)
  comp.noage.lm <- lm(cdi.prop ~ lexical_category : I((vocab.mean*100)^2) + 
                           lexical_category : I((vocab.mean*100)) + 0,
                         data=prop.comp.lang)
  prop.comp.lang$age.predict <- predict.lm(comp.age.lm)
  prop.comp.lang$noage.predict <- predict.lm(comp.noage.lm)
  prop.comp.lang$language <- lang
  return(prop.comp.lang)
  }

comp.english <- comp.lm("English")
comp.spanish <- comp.lm("Spanish")
comp.norweigian <- comp.lm("Norwegian")
comp.danish <- comp.lm("Danish")

comp.all <- rbind(comp.english, comp.spanish, comp.norweigian, comp.danish) %>%
  mutate(language = factor(language,
                           levels = c("English", "Spanish", "Norwegian", "Danish")))

Plot predictions of model with age.

#quartz(width=10, height=6)
ggplot(comp.all,
       aes(x=vocab.mean, y=cdi.prop, colour=age.group, linetype=lexical_category)) +
#       aes(x=vocab.mean, y=cdi.prop, colour=lexical_category, 
#           shape=lexical_category, fill=lexical_category,
#           label=lexical_category)) +
#  geom_jitter(size=0.7, alpha=0.2, pch="o") +
  geom_line(aes(y = age.predict)) +
  facet_wrap(~language) +
#  geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
#             linetype="dashed", color="grey") + #baselines for each language
  #geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of CDI Category") +
  scale_x_continuous(name = "Vocabulary Size") +
#  geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1", name = "Age Group (months)") +
  scale_linetype(name = "Lexical Category") +
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="right")

#ggsave(file=("composition_model_age_points.png"), width=10, height=6)

Plot predictions of model without age.

#quartz(width=10, height=6)
ggplot(comp.all,
       aes(x=vocab.mean, y=cdi.prop, colour=age.group, linetype=lexical_category)) +
#       aes(x=vocab.mean, y=cdi.prop, colour=lexical_category, 
#           shape=lexical_category, fill=lexical_category,
#           label=lexical_category)) +
#  geom_jitter(size=0.7, alpha=0.2, pch="o") +
  geom_line(aes(y = noage.predict)) +
  facet_wrap(~language) +
#  geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
#             linetype="dashed", color="grey") + #baselines for each language
  #geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of CDI Category") +
  scale_x_continuous(name = "Vocabulary Size") +
#  geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1", name = "Age Group (months)") +
  scale_linetype(name = "Lexical Category") +
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="right")

#ggsave(file=("composition_model_age_points.png"), width=10, height=6)

Plot new vocabulary composition by language.

#quartz(width=8, height=6)
ggplot(filter(prop.comp, lexical_category != "Other"),
       aes(x=vocab.mean, y=cdi.prop, colour=lexical_category, 
           shape=lexical_category, fill=lexical_category,
           label=lexical_category)) +
  geom_jitter(size=0.7, alpha=0.2, pch="o") +
  facet_wrap(~ language) +
#  geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
#             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of CDI Category") +
  scale_x_continuous(name = "Vocabulary Size") +
#  geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1", name = "Lexical Category") +
  scale_fill_brewer(palette = "Set1", guide = FALSE) +
  scale_shape_discrete(guide = FALSE) +
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="bottom")

#ggsave(file=("composition.pdf"), width=8, height=6)